

-- 1) Create a SAS credential (include the leading '?') to access the Blob container with MAG data
CREATE DATABASE SCOPED CREDENTIAL blob_sas
WITH IDENTITY = 'SHARED ACCESS SIGNATURE',
     SECRET   = '?sv=2024-...&ss=bfqt&srt=co&sp=rl...&sig=...';
GO

-- 2) Create the PolyBase data source pointing at the container
CREATE EXTERNAL DATA SOURCE MyBlob
WITH (
    TYPE = HADOOP,
    LOCATION   = 'wasbs://[container name]@[blob name].blob.core.windows.net',
    CREDENTIAL = blob_sas
);
GO


-- CSV/TSV format used by your external tables
IF NOT EXISTS (SELECT 1 FROM sys.external_file_formats WHERE name = 'CsvFmt')
    CREATE EXTERNAL FILE FORMAT CsvFmt
    WITH (
        FORMAT_TYPE = DELIMITEDTEXT,
        FORMAT_OPTIONS (
            FIELD_TERMINATOR = '\t',
            STRING_DELIMITER = '0x00',   -- no quoting in your TSVs
            USE_TYPE_DEFAULT = FALSE,
            ENCODING = 'UTF8'
        )
    );
GO

IF NOT EXISTS (SELECT 1 FROM sys.external_file_formats WHERE name = 'CsvFmtQ')
    CREATE EXTERNAL FILE FORMAT CsvFmtQ
    WITH (
        FORMAT_TYPE = DELIMITEDTEXT,
        FORMAT_OPTIONS (
            FIELD_TERMINATOR = '\t',
            STRING_DELIMITER = '"',   -- handle any quoted fields safely
            USE_TYPE_DEFAULT = FALSE,
            ENCODING = 'UTF8'
        )
    );
GO

-- Parquet, in case you CTAS to lake
IF NOT EXISTS (SELECT 1 FROM sys.external_file_formats WHERE name = 'ParquetFmt')
    CREATE EXTERNAL FILE FORMAT ParquetFmt
    WITH (FORMAT_TYPE = PARQUET);
GO

/* ===============================
   Papers.txt  ->  dbo.Papers_xt + dbo.Papers_full
   =============================== */

/* Drop & recreate the EXTERNAL TABLE (via DMV pattern) */
IF EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'Papers_xt' AND schema_id = SCHEMA_ID(N'dbo')
)
    EXEC (N'DROP EXTERNAL TABLE dbo.Papers_xt;');
GO

CREATE EXTERNAL TABLE dbo.Papers_xt
(
    PaperId               NVARCHAR(4000),
    [Rank]                NVARCHAR(4000),
    Doi                   NVARCHAR(4000),
    DocType               NVARCHAR(4000),
    PaperTitle            NVARCHAR(4000),
    OriginalTitle         NVARCHAR(4000),
    BookTitle             NVARCHAR(4000),
    [Year]                NVARCHAR(4000),   -- nullable
    [Date]                NVARCHAR(4000),   -- nullable
    OnlineDate            NVARCHAR(4000),   -- nullable
    Publisher             NVARCHAR(4000),
    JournalId             NVARCHAR(4000),   -- nullable
    ConferenceSeriesId    NVARCHAR(4000),   -- nullable
    ConferenceInstanceId  NVARCHAR(4000),   -- nullable
    Volume                NVARCHAR(4000),
    Issue                 NVARCHAR(4000),
    FirstPage             NVARCHAR(4000),
    LastPage              NVARCHAR(4000),
    ReferenceCount        NVARCHAR(4000),
    CitationCount         NVARCHAR(4000),
    EstimatedCitation     NVARCHAR(4000),
    OriginalVenue         NVARCHAR(4000),
    FamilyId              NVARCHAR(4000),   -- nullable
    FamilyRank            NVARCHAR(4000),   -- nullable (uint)
    DocSubTypes           NVARCHAR(4000),
    CreatedDate           NVARCHAR(4000)
)
WITH (
    LOCATION     = '/mag/Papers.txt',
    DATA_SOURCE  = MyBlob,
    FILE_FORMAT  = CsvFmt,      -- FIELD_TERMINATOR = '\t', STRING_DELIMITER='0x00'
    REJECT_TYPE  = VALUE,
    REJECT_VALUE = 1000000
);
GO

/* Drop & recreate the typed VIEW */
IF OBJECT_ID(N'dbo.Papers_full', N'V') IS NOT NULL
    DROP VIEW dbo.Papers_full;
GO

CREATE VIEW dbo.Papers_full AS
SELECT
    TRY_CAST(PaperId              AS BIGINT)    AS PaperId,               -- NOT NULL (U-SQL)
    TRY_CAST([Rank]               AS INT)       AS [Rank],                -- NOT NULL (uint -> INT)
    Doi,
    DocType,
    PaperTitle,
    OriginalTitle,
    BookTitle,
    TRY_CAST([Year]               AS INT)       AS [Year],                -- nullable
    TRY_CONVERT(DATETIME2(0), [Date])           AS [Date],                -- nullable
    TRY_CONVERT(DATETIME2(0), OnlineDate)       AS OnlineDate,            -- nullable
    Publisher,
    TRY_CAST(JournalId            AS BIGINT)    AS JournalId,             -- nullable
    TRY_CAST(ConferenceSeriesId   AS BIGINT)    AS ConferenceSeriesId,    -- nullable
    TRY_CAST(ConferenceInstanceId AS BIGINT)    AS ConferenceInstanceId,  -- nullable
    Volume,
    Issue,
    FirstPage,
    LastPage,
    TRY_CAST(ReferenceCount       AS BIGINT)    AS ReferenceCount,        -- NOT NULL
    TRY_CAST(CitationCount        AS BIGINT)    AS CitationCount,         -- NOT NULL
    TRY_CAST(EstimatedCitation    AS BIGINT)    AS EstimatedCitation,     -- NOT NULL
    OriginalVenue,
    TRY_CAST(FamilyId             AS BIGINT)    AS FamilyId,              -- nullable
    TRY_CAST(FamilyRank           AS INT)       AS FamilyRank,            -- nullable (uint -> INT)
    DocSubTypes,
    TRY_CONVERT(DATETIME2(0), CreatedDate)      AS CreatedDate            -- NOT NULL
FROM dbo.Papers_xt
WHERE
    -- enforce the U-SQL non-nullable numeric/datetime fields:
    TRY_CAST(PaperId           AS BIGINT)   IS NOT NULL
    AND TRY_CAST([Rank]        AS INT)      IS NOT NULL
    AND TRY_CAST(ReferenceCount AS BIGINT)  IS NOT NULL
    AND TRY_CAST(CitationCount  AS BIGINT)  IS NOT NULL
    AND TRY_CAST(EstimatedCitation AS BIGINT) IS NOT NULL
    AND TRY_CONVERT(DATETIME2(0), CreatedDate) IS NOT NULL;
GO


-- Drop any previous internal table
IF OBJECT_ID(N'dbo.Papers2_int', N'U') IS NOT NULL
    DROP TABLE dbo.Papers2_int;
GO

-- Create internal table with HASH(PaperId) and columnstore
CREATE TABLE dbo.Papers2_int
WITH (
    DISTRIBUTION = HASH(PaperId),
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT PaperId, [Year], JournalId, CitationCount, EstimatedCitation
FROM dbo.Papers_full   
WHERE DocType = 'Journal' AND [Year] IS NOT NULL;
GO

-- Helpful stats so the optimizer sizes memory correctly
CREATE STATISTICS st_p2_pid  ON dbo.Papers2_int(PaperId);
CREATE STATISTICS st_p2_year ON dbo.Papers2_int([Year]);
GO


IF OBJECT_ID(N'dbo.Papers2', N'V') IS NOT NULL
    DROP VIEW dbo.Papers2;
GO
CREATE VIEW dbo.Papers2 AS
SELECT PaperId, [Year], JournalId, CitationCount, EstimatedCitation
FROM dbo.Papers2_int;
GO



/* =========================================================
   PaperFieldsOfStudy.txt  ->  dbo.PaperFieldsOfStudy_xt  +  dbo.PaperFieldsOfStudy
   ========================================================= */
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'PaperFieldsOfStudy_xt' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.PaperFieldsOfStudy_xt;');
GO

CREATE EXTERNAL TABLE dbo.PaperFieldsOfStudy_xt
(
    PaperId        BIGINT,
    FieldOfStudyId BIGINT,
    Score          FLOAT
)
WITH (
    LOCATION    = '/advanced/PaperFieldsOfStudy.txt',
    DATA_SOURCE = MyBlob,
    FILE_FORMAT = CsvFmt,
    REJECT_TYPE = VALUE,
    REJECT_VALUE = 10000
);
GO

/* 1) Build an internal, hash-distributed copy */
IF OBJECT_ID(N'dbo.PaperFieldsOfStudy_int', N'U') IS NOT NULL
    DROP TABLE dbo.PaperFieldsOfStudy_int;
GO

CREATE TABLE dbo.PaperFieldsOfStudy_int
WITH (
    DISTRIBUTION = HASH(PaperId),          -- align joins on PaperId
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT PaperId, FieldOfStudyId, Score
FROM dbo.PaperFieldsOfStudy_xt;            -- your external table
GO

/* Helpful stats for better memory grants/estimates */
CREATE STATISTICS st_pfos_pid  ON dbo.PaperFieldsOfStudy_int(PaperId);
CREATE STATISTICS st_pfos_fos  ON dbo.PaperFieldsOfStudy_int(FieldOfStudyId);
CREATE STATISTICS st_pfos_score ON dbo.PaperFieldsOfStudy_int(Score);
GO

/* 2) Point the view name at the internal table */
IF OBJECT_ID(N'dbo.PaperFieldsOfStudy', N'V') IS NOT NULL
    DROP VIEW dbo.PaperFieldsOfStudy;
GO
CREATE VIEW dbo.PaperFieldsOfStudy AS
SELECT PaperId, FieldOfStudyId, Score
FROM dbo.PaperFieldsOfStudy_int;
GO



/* =========================================================
   FieldOfStudyChildren.txt  ->  dbo.FieldOfStudyChildren_xt  +  dbo.FieldOfStudyChildren
   ========================================================= */
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'FieldOfStudyChildren_xt' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.FieldOfStudyChildren_xt;');
GO

CREATE EXTERNAL TABLE dbo.FieldOfStudyChildren_xt
(
    FieldOfStudyId      BIGINT,
    ChildFieldOfStudyId BIGINT
)
WITH (
    LOCATION    = '/advanced/FieldOfStudyChildren.txt',
    DATA_SOURCE = MyBlob,
    FILE_FORMAT = CsvFmt,
    REJECT_TYPE = VALUE,
    REJECT_VALUE = 10000
);
GO

IF OBJECT_ID(N'dbo.FieldOfStudyChildren', N'V') IS NOT NULL
    DROP VIEW dbo.FieldOfStudyChildren;
GO
CREATE VIEW dbo.FieldOfStudyChildren AS
SELECT * FROM dbo.FieldOfStudyChildren_xt;
GO


/* =========================================================
   FieldsOfStudy.txt (dirty Level) -> dbo.FieldsOfStudy_txt + dbo.FieldsOfStudy
   ========================================================= */
IF EXISTS (
    SELECT 1 FROM sys.external_tables
    WHERE name = N'FieldsOfStudy_txt' AND schema_id = SCHEMA_ID(N'dbo')
)
    EXEC (N'DROP EXTERNAL TABLE dbo.FieldsOfStudy_txt;');
GO

CREATE EXTERNAL TABLE dbo.FieldsOfStudy_txt
(
    FieldOfStudyId   NVARCHAR(4000),
    [Rank]           NVARCHAR(4000),
    NormalizedName   NVARCHAR(4000),
    DisplayName      NVARCHAR(4000),
    MainType         NVARCHAR(4000),
    [Level]          NVARCHAR(4000),
    PaperCount       NVARCHAR(4000),
    PaperFamilyCount NVARCHAR(4000),
    CitationCount    NVARCHAR(4000),
    CreatedDate      NVARCHAR(4000)
)
WITH (
    LOCATION     = '/advanced/FieldsOfStudy.txt',
    DATA_SOURCE  = MyBlob,
    FILE_FORMAT  = CsvFmt,
    REJECT_TYPE  = VALUE,
    REJECT_VALUE = 1000000
);
GO

-- typed view (keeps rows that satisfy the non-nullable casts)
IF OBJECT_ID(N'dbo.FieldsOfStudy', N'V') IS NOT NULL
    DROP VIEW dbo.FieldsOfStudy;
GO
CREATE VIEW dbo.FieldsOfStudy AS
SELECT
    TRY_CAST(FieldOfStudyId   AS BIGINT)    AS FieldOfStudyId,     -- NOT NULL
    TRY_CAST([Rank]           AS INT)       AS [Rank],             -- NOT NULL (uint→INT)
    NormalizedName,
    DisplayName,
    MainType,
    TRY_CAST([Level]          AS INT)       AS [Level],            -- NOT NULL
    TRY_CAST(PaperCount       AS BIGINT)    AS PaperCount,         -- NOT NULL
    TRY_CAST(PaperFamilyCount AS BIGINT)    AS PaperFamilyCount,   -- NOT NULL
    TRY_CAST(CitationCount    AS BIGINT)    AS CitationCount,      -- NOT NULL
    TRY_CONVERT(DATETIME2(0), CreatedDate)  AS CreatedDate         -- NOT NULL
FROM dbo.FieldsOfStudy_txt
WHERE TRY_CAST(FieldOfStudyId   AS BIGINT)  IS NOT NULL
  AND TRY_CAST([Rank]           AS INT)     IS NOT NULL
  AND TRY_CAST([Level]          AS INT)     IS NOT NULL
  AND TRY_CAST(PaperCount       AS BIGINT)  IS NOT NULL
  AND TRY_CAST(PaperFamilyCount AS BIGINT)  IS NOT NULL
  AND TRY_CAST(CitationCount    AS BIGINT)  IS NOT NULL
  AND TRY_CONVERT(DATETIME2(0), CreatedDate) IS NOT NULL;
GO


/* =========================================================
   PaperAuthorAffiliations.txt (overflow guard)
   -> dbo.PaperAuthorAffiliations_txt + dbo.PaperAuthorAffiliations
   ========================================================= */
IF EXISTS (
    SELECT 1 FROM sys.external_tables
    WHERE name = N'PaperAuthorAffiliations_xt' AND schema_id = SCHEMA_ID(N'dbo')
)
    EXEC (N'DROP EXTERNAL TABLE dbo.PaperAuthorAffiliations_xt;');
GO

CREATE EXTERNAL TABLE dbo.PaperAuthorAffiliations_xt
(
    PaperId              NVARCHAR(4000),
    AuthorId             NVARCHAR(4000),
    AffiliationId        NVARCHAR(4000),  -- nullable
    AuthorSequenceNumber NVARCHAR(4000),
    OriginalAuthor       NVARCHAR(4000),
    OriginalAffiliation  NVARCHAR(4000)
)
WITH (
    LOCATION     = '/mag/PaperAuthorAffiliations.txt',
    DATA_SOURCE  = MyBlob,
    FILE_FORMAT  = CsvFmt,
    REJECT_TYPE  = VALUE,
    REJECT_VALUE = 1000000
);
GO

-- typed view
IF OBJECT_ID(N'dbo.PaperAuthorAffiliations', N'V') IS NOT NULL
    DROP VIEW dbo.PaperAuthorAffiliations;
GO
CREATE VIEW dbo.PaperAuthorAffiliations AS
SELECT
    TRY_CAST(PaperId              AS BIGINT)   AS PaperId,             -- NOT NULL in U-SQL
    TRY_CAST(AuthorId             AS BIGINT)   AS AuthorId,            -- NOT NULL
    TRY_CAST(AffiliationId        AS BIGINT)   AS AffiliationId,       -- nullable
    TRY_CAST(AuthorSequenceNumber AS INT)      AS AuthorSequenceNumber,-- NOT NULL (uint)
    OriginalAuthor,
    OriginalAffiliation
FROM dbo.PaperAuthorAffiliations_xt
WHERE TRY_CAST(PaperId AS BIGINT) IS NOT NULL
  AND TRY_CAST(AuthorId AS BIGINT) IS NOT NULL
  AND TRY_CAST(AuthorSequenceNumber AS INT) IS NOT NULL;
GO


/* =========================================================
   FieldsOfStudyToTopField.txt  ->  dbo.FieldsOfStudyToTopField_xt + dbo.FieldsOfStudyToTopField
   ========================================================= */
-- Drop & recreate the external table
IF EXISTS (
    SELECT 1 FROM sys.external_tables
    WHERE name = N'FieldsOfStudyToTopField_xt' AND schema_id = SCHEMA_ID(N'dbo')
)
    EXEC (N'DROP EXTERNAL TABLE dbo.FieldsOfStudyToTopField_xt;');
GO

CREATE EXTERNAL TABLE dbo.FieldsOfStudyToTopField_xt
(
    FieldOfStudyId   NVARCHAR(4000),
    OGFieldOfStudyId NVARCHAR(4000),   -- nullable
    OGFieldLevel     NVARCHAR(4000),   -- nullable
    OGFieldName      NVARCHAR(4000)
)
WITH (
    LOCATION     = '/advanced/FieldsOfStudyToTopField.txt',
    DATA_SOURCE  = MyBlob,
    FILE_FORMAT  = CsvFmt,             -- no quoting
    REJECT_TYPE  = VALUE,
    REJECT_VALUE = 1000000
);
GO

-- typed view (drops header & any malformed rows on the required key)
IF OBJECT_ID(N'dbo.FieldsOfStudyToTopField', N'V') IS NOT NULL
    DROP VIEW dbo.FieldsOfStudyToTopField;
GO
CREATE VIEW dbo.FieldsOfStudyToTopField AS
SELECT
    TRY_CAST(FieldOfStudyId   AS BIGINT) AS FieldOfStudyId,     -- required
    TRY_CAST(OGFieldOfStudyId AS BIGINT) AS OGFieldOfStudyId,   -- nullable in U-SQL
    TRY_CAST(OGFieldLevel     AS INT)    AS OGFieldLevel,       -- nullable in U-SQL
    OGFieldName
FROM dbo.FieldsOfStudyToTopField_xt
WHERE TRY_CAST(FieldOfStudyId AS BIGINT) IS NOT NULL;           -- filters header row
GO



/* =========================================================
   PaperReferences.txt  ->  dbo.PaperReferences_xt + dbo.PaperReferences
   ========================================================= */
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'PaperReferences_xt' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.PaperReferences_xt;');
GO

CREATE EXTERNAL TABLE dbo.PaperReferences_xt
(
    PaperId          BIGINT,
    PaperReferenceId BIGINT
)
WITH (
    LOCATION    = '/mag/PaperReferences.txt',
    DATA_SOURCE = MyBlob,
    FILE_FORMAT = CsvFmt,
    REJECT_TYPE = VALUE,
    REJECT_VALUE = 10000
);
GO

/* 1) INTERNAL copy hashed by the citing paper (PaperId) */
IF OBJECT_ID(N'dbo.PaperReferences_byCiting_int', N'U') IS NOT NULL
    DROP TABLE dbo.PaperReferences_byCiting_int;
GO
CREATE TABLE dbo.PaperReferences_byCiting_int
WITH (
    DISTRIBUTION = HASH(PaperId),
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT PaperId, PaperReferenceId
FROM dbo.PaperReferences_xt;   -- external table already created
GO

/* Helpful stats */
CREATE STATISTICS st_ref_citing_pid  ON dbo.PaperReferences_byCiting_int(PaperId);
CREATE STATISTICS st_ref_citing_prid ON dbo.PaperReferences_byCiting_int(PaperReferenceId);
GO


/* 2) INTERNAL copy hashed by the cited paper (PaperReferenceId) */
IF OBJECT_ID(N'dbo.PaperReferences_byCited_int', N'U') IS NOT NULL
    DROP TABLE dbo.PaperReferences_byCited_int;
GO
CREATE TABLE dbo.PaperReferences_byCited_int
WITH (
    DISTRIBUTION = HASH(PaperReferenceId),
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT PaperId, PaperReferenceId
FROM dbo.PaperReferences_xt;
GO

/* Helpful stats */
CREATE STATISTICS st_ref_cited_prid ON dbo.PaperReferences_byCited_int(PaperReferenceId);
CREATE STATISTICS st_ref_cited_pid  ON dbo.PaperReferences_byCited_int(PaperId);
GO


/* 3) Views that point to the internal tables */
IF OBJECT_ID(N'dbo.PaperReferences', N'V') IS NOT NULL
    DROP VIEW dbo.PaperReferences;
GO
-- Keep the legacy name mapped to the most common pattern (by citing)
CREATE VIEW dbo.PaperReferences AS
SELECT PaperId, PaperReferenceId
FROM dbo.PaperReferences_byCiting_int;
GO

IF OBJECT_ID(N'dbo.PaperReferences_byCited', N'V') IS NOT NULL
    DROP VIEW dbo.PaperReferences_byCited;
GO
CREATE VIEW dbo.PaperReferences_byCited AS
SELECT PaperId, PaperReferenceId
FROM dbo.PaperReferences_byCited_int;
GO


/* PaperAuthorCount */
IF OBJECT_ID(N'dbo.PaperAuthorCount', N'V') IS NOT NULL
    DROP VIEW dbo.PaperAuthorCount;
GO
CREATE VIEW dbo.PaperAuthorCount AS
SELECT PaperId, MAX(AuthorSequenceNumber) AS AuthorCount
FROM   dbo.PaperAuthorAffiliations
GROUP  BY PaperId;
GO

/* PaperAuthor (distinct) */
IF OBJECT_ID(N'dbo.PaperAuthor', N'V') IS NOT NULL
    DROP VIEW dbo.PaperAuthor;
GO
CREATE VIEW dbo.PaperAuthor AS
SELECT DISTINCT PaperId, AuthorId, AuthorSequenceNumber
FROM   dbo.PaperAuthorAffiliations;
GO

/* PaperAuthorWithCount */
IF OBJECT_ID(N'dbo.PaperAuthorWithCount', N'V') IS NOT NULL
    DROP VIEW dbo.PaperAuthorWithCount;
GO
CREATE VIEW dbo.PaperAuthorWithCount AS
SELECT PaperId, AuthorId, AuthorSequenceNumber,
       COUNT(*) OVER(PARTITION BY PaperId) AS AuthorCount
FROM   dbo.PaperAuthor;
GO

/* first & last author views */
IF OBJECT_ID(N'dbo.PaperFirstAuthor', N'V') IS NOT NULL
    DROP VIEW dbo.PaperFirstAuthor;
GO
CREATE VIEW dbo.PaperFirstAuthor AS
SELECT PaperId, AuthorId
FROM   dbo.PaperAuthorWithCount
WHERE  AuthorSequenceNumber = 1;
GO

IF OBJECT_ID(N'dbo.PaperLastAuthor', N'V') IS NOT NULL
    DROP VIEW dbo.PaperLastAuthor;
GO
CREATE VIEW dbo.PaperLastAuthor AS
SELECT PaperId, AuthorId
FROM   dbo.PaperAuthorWithCount
WHERE  AuthorSequenceNumber = AuthorCount;
GO

/* References (2-step join) */
IF OBJECT_ID(N'dbo.PaperReferencesExt', N'V') IS NOT NULL
    DROP VIEW dbo.PaperReferencesExt;
GO
CREATE VIEW dbo.PaperReferencesExt AS
SELECT R.PaperId, R.PaperReferenceId, P.[Year] AS PaperYear
FROM   dbo.PaperReferences R
JOIN   dbo.Papers2         P ON R.PaperId = P.PaperId;
GO

IF OBJECT_ID(N'dbo.PaperReferencesExt2', N'V') IS NOT NULL
    DROP VIEW dbo.PaperReferencesExt2;
GO
CREATE VIEW dbo.PaperReferencesExt2 AS
SELECT R.PaperId, R.PaperReferenceId, R.PaperYear,
       P.[Year] AS PaperReferenceYear
FROM   dbo.PaperReferencesExt R
JOIN   dbo.Papers2            P ON R.PaperReferenceId = P.PaperId;
GO

/* Citation windows */
IF OBJECT_ID(N'dbo.PaperCitations', N'V') IS NOT NULL
    DROP VIEW dbo.PaperCitations;
GO
CREATE VIEW dbo.PaperCitations AS
SELECT DISTINCT
       PaperReferenceId AS PaperId,
       SUM(CASE WHEN PaperYear<=PaperReferenceYear+3  THEN 1 END)
            OVER(PARTITION BY PaperReferenceId) AS FCites_3year,
       SUM(CASE WHEN PaperYear<=PaperReferenceYear+5  THEN 1 END)
            OVER(PARTITION BY PaperReferenceId) AS FCites_5year,
       SUM(CASE WHEN PaperYear<=PaperReferenceYear+10 THEN 1 END)
            OVER(PARTITION BY PaperReferenceId) AS FCites_10year
FROM   dbo.PaperReferencesExt2;
GO

/* PaperDataWithCites */
IF OBJECT_ID(N'dbo.PaperDataWithCites', N'V') IS NOT NULL
    DROP VIEW dbo.PaperDataWithCites;
GO
CREATE VIEW dbo.PaperDataWithCites AS
SELECT B.PaperId,B.[Year],B.JournalId,
       B.CitationCount,B.EstimatedCitation,
       COALESCE(C.FCites_3year,0)  AS FCites_3year,
       COALESCE(C.FCites_5year,0)  AS FCites_5year,
       COALESCE(C.FCites_10year,0) AS FCites_10year
FROM   dbo.Papers2 B
LEFT   JOIN dbo.PaperCitations C ON B.PaperId = C.PaperId;
GO

/* PaperDataRanked */
IF OBJECT_ID(N'dbo.PaperDataRanked', N'V') IS NOT NULL
    DROP VIEW dbo.PaperDataRanked;
GO
CREATE VIEW dbo.PaperDataRanked AS
SELECT *, COUNT(*) OVER(PARTITION BY [Year]) AS CohortSize,
       RANK() OVER(PARTITION BY [Year] ORDER BY FCites_3year  DESC, PaperId) AS Rank_3yr,
       RANK() OVER(PARTITION BY [Year] ORDER BY FCites_5year  DESC, PaperId) AS Rank_5yr,
       RANK() OVER(PARTITION BY [Year] ORDER BY FCites_10year DESC, PaperId) AS Rank_10yr
FROM   dbo.PaperDataWithCites;
GO

/* PaperDataFlagged */
IF OBJECT_ID(N'dbo.PaperDataFlagged', N'V') IS NOT NULL
    DROP VIEW dbo.PaperDataFlagged;
GO
CREATE VIEW dbo.PaperDataFlagged AS
SELECT PaperId,[Year],JournalId,CitationCount,EstimatedCitation,
       FCites_3year,FCites_5year,FCites_10year,
       CASE WHEN Rank_3yr <= CEILING(0.05*CohortSize) THEN 1 ELSE 0 END AS Top5pct_3yr,
       CASE WHEN Rank_5yr <= CEILING(0.05*CohortSize) THEN 1 ELSE 0 END AS Top5pct_5yr,
       CASE WHEN Rank_10yr<= CEILING(0.05*CohortSize) THEN 1 ELSE 0 END AS Top5pct_10yr
FROM   dbo.PaperDataRanked;
GO

/* Paper_Fields */
IF OBJECT_ID(N'dbo.Paper_Fields', N'V') IS NOT NULL
    DROP VIEW dbo.Paper_Fields;
GO
CREATE VIEW dbo.Paper_Fields AS
SELECT
    P.PaperId,
    P.[Year],
    F.FieldOfStudyId,
    F.Score
FROM dbo.Papers2             AS P
JOIN dbo.PaperFieldsOfStudy  AS F
     ON P.PaperId = F.PaperId
GO

/* Level4Fields */
IF OBJECT_ID(N'dbo.Level4Fields', N'V') IS NOT NULL
    DROP VIEW dbo.Level4Fields;
GO
CREATE VIEW dbo.Level4Fields AS
SELECT FieldOfStudyId,
       [Level]
FROM   dbo.FieldsOfStudy
WHERE  [Level] = 4;
GO

/* Level5FieldsParents */
IF OBJECT_ID(N'dbo.Level5FieldsParents', N'V') IS NOT NULL
    DROP VIEW dbo.Level5FieldsParents;
GO
CREATE VIEW dbo.Level5FieldsParents AS
SELECT DISTINCT
       F.FieldOfStudyId,                 -- level-5 field
       C.FieldOfStudyId AS ParentFieldOfStudyId   -- its level-4 parent
FROM   dbo.FieldsOfStudy          AS F
JOIN   dbo.FieldOfStudyChildren   AS C
       ON  F.FieldOfStudyId = C.ChildFieldOfStudyId
WHERE  F.[Level] = 5;
GO

/* Level4FieldsCrossWalk */
IF OBJECT_ID(N'dbo.Level4FieldsCrossWalk', N'V') IS NOT NULL
    DROP VIEW dbo.Level4FieldsCrossWalk;
GO
CREATE VIEW dbo.Level4FieldsCrossWalk AS
SELECT FieldOfStudyId,
       FieldOfStudyId AS Level4Field
FROM   dbo.Level4Fields
UNION
SELECT FieldOfStudyId,
       ParentFieldOfStudyId AS Level4Field
FROM   dbo.Level5FieldsParents;
GO

/* PaperTopField */
IF OBJECT_ID(N'dbo.PaperTopField', N'V') IS NOT NULL
    DROP VIEW dbo.PaperTopField;
GO
CREATE VIEW dbo.PaperTopField AS
SELECT P.PaperId,
       F.OGFieldOfStudyId
FROM   dbo.PaperFieldsOfStudy     AS P
JOIN   dbo.FieldsOfStudyToTopField AS F
       ON P.FieldOfStudyId = F.FieldOfStudyId
WHERE  F.OGFieldLevel = 0;
GO


/* ──────────────────────────────────────────────────────────────────────────
   Ensure the Level4 crosswalk is materialized once (REPLICATE is ideal)
   ────────────────────────────────────────────────────────────────────────── */
IF OBJECT_ID(N'dbo.Level4FieldsCrossWalk_int', N'U') IS NULL
BEGIN
    CREATE TABLE dbo.Level4FieldsCrossWalk_int
    WITH (DISTRIBUTION = REPLICATE, CLUSTERED COLUMNSTORE INDEX)
    AS
    SELECT FieldOfStudyId, Level4Field
    FROM dbo.Level4FieldsCrossWalk;  --  existing view over the crosswalk
END
GO

/* ──────────────────────────────────────────────────────────────────────────
   1) Paper_Refs  (two internal copies for local joins on either key)
   ────────────────────────────────────────────────────────────────────────── */

-- By *citing* paper → local joins on PaperId
IF OBJECT_ID(N'dbo.Paper_Refs_byCiting_int', N'U') IS NOT NULL
    DROP TABLE dbo.Paper_Refs_byCiting_int;
GO
CREATE TABLE dbo.Paper_Refs_byCiting_int
WITH (DISTRIBUTION = HASH(PaperId), CLUSTERED COLUMNSTORE INDEX)
AS
SELECT  P.PaperId, P.[Year], R.PaperReferenceId
FROM    dbo.Papers2_int                     AS P    -- HASH(PaperId)
JOIN    dbo.PaperReferences_byCiting_int    AS R    -- HASH(PaperId)
        ON P.PaperId = R.PaperId;
GO
CREATE STATISTICS st_pr_cite_pid ON dbo.Paper_Refs_byCiting_int(PaperId);
CREATE STATISTICS st_pr_cite_year ON dbo.Paper_Refs_byCiting_int([Year]);
GO

-- By *cited* paper → local joins on PaperReferenceId
IF OBJECT_ID(N'dbo.Paper_Refs_byCited_int', N'U') IS NOT NULL
    DROP TABLE dbo.Paper_Refs_byCited_int;
GO
CREATE TABLE dbo.Paper_Refs_byCited_int
WITH (DISTRIBUTION = HASH(PaperReferenceId), CLUSTERED COLUMNSTORE INDEX)
AS
SELECT  P.PaperId, P.[Year], R.PaperReferenceId
FROM    dbo.Papers2_int                  AS P
JOIN    dbo.PaperReferences_byCited_int  AS R    -- HASH(PaperReferenceId)
        ON P.PaperId = R.PaperId;
GO
CREATE STATISTICS st_pr_cited_prid ON dbo.Paper_Refs_byCited_int(PaperReferenceId);
GO

-- Keep  original view name pointing at the *citing* version
IF OBJECT_ID(N'dbo.Paper_Refs', N'V') IS NOT NULL
    DROP VIEW dbo.Paper_Refs;
GO
CREATE VIEW dbo.Paper_Refs AS
SELECT PaperId, [Year], PaperReferenceId
FROM dbo.Paper_Refs_byCiting_int;
GO


/* ──────────────────────────────────────────────────────────────────────────
   2) Paper_Ref_Fields  (join on referenced paper; result hashed by PaperId)
   ────────────────────────────────────────────────────────────────────────── */
IF OBJECT_ID(N'dbo.Paper_Ref_Fields_int', N'U') IS NOT NULL
    DROP TABLE dbo.Paper_Ref_Fields_int;
GO
CREATE TABLE dbo.Paper_Ref_Fields_int
WITH (DISTRIBUTION = HASH(PaperId), CLUSTERED COLUMNSTORE INDEX)
AS
SELECT DISTINCT
       R.PaperId,        -- citing paper
       R.[Year],
       F.FieldOfStudyId
FROM   dbo.Paper_Refs_byCited_int   AS R          -- HASH(PaperReferenceId)
JOIN   dbo.PaperFieldsOfStudy_int   AS F          -- HASH(PaperId)
       ON R.PaperReferenceId = F.PaperId          -- local (same hashed value)
WHERE  F.Score >= 0.4;
GO
CREATE STATISTICS st_prf_pid ON dbo.Paper_Ref_Fields_int(PaperId);
CREATE STATISTICS st_prf_fid ON dbo.Paper_Ref_Fields_int(FieldOfStudyId);
GO



IF OBJECT_ID(N'dbo.Paper_Ref_Fields', N'V') IS NOT NULL
    DROP VIEW dbo.Paper_Ref_Fields;
GO
CREATE VIEW dbo.Paper_Ref_Fields AS
SELECT PaperId, [Year], FieldOfStudyId
FROM dbo.Paper_Ref_Fields_int;
GO


/* ──────────────────────────────────────────────────────────────────────────
   3) Paper_Ref_L4Fields  (lookup via replicated crosswalk; hash by PaperId)
   ────────────────────────────────────────────────────────────────────────── */
IF OBJECT_ID(N'dbo.Paper_Ref_L4Fields_int', N'U') IS NOT NULL
    DROP TABLE dbo.Paper_Ref_L4Fields_int;
GO
CREATE TABLE dbo.Paper_Ref_L4Fields_int
WITH (DISTRIBUTION = HASH(PaperId), CLUSTERED COLUMNSTORE INDEX)
AS
SELECT DISTINCT
       P.PaperId,
       P.[Year],
       X.Level4Field AS L4FieldOfStudyId
FROM   dbo.Paper_Ref_Fields_int        AS P      -- HASH(PaperId)
JOIN   dbo.Level4FieldsCrossWalk_int   AS X      -- REPLICATE (no shuffle)
       ON P.FieldOfStudyId = X.FieldOfStudyId;
GO
CREATE STATISTICS st_prl_pid ON dbo.Paper_Ref_L4Fields_int(PaperId);
GO

IF OBJECT_ID(N'dbo.Paper_Ref_L4Fields', N'V') IS NOT NULL
    DROP VIEW dbo.Paper_Ref_L4Fields;
GO
CREATE VIEW dbo.Paper_Ref_L4Fields AS
SELECT PaperId, [Year], L4FieldOfStudyId
FROM dbo.Paper_Ref_L4Fields_int;
GO


/* ──────────────────────────────────────────────────────────────────────────
   4) PaperRefFieldPairs  (self-join on PaperId; keep hash by PaperId)
   ────────────────────────────────────────────────────────────────────────── */
IF OBJECT_ID(N'dbo.PaperRefFieldPairs_int', N'U') IS NOT NULL
    DROP TABLE dbo.PaperRefFieldPairs_int;
GO
CREATE TABLE dbo.PaperRefFieldPairs_int
WITH (DISTRIBUTION = HASH(PaperId), CLUSTERED COLUMNSTORE INDEX)
AS
SELECT
    A.PaperId,
    A.[Year],
    A.L4FieldOfStudyId AS Field1,
    B.L4FieldOfStudyId AS Field2
FROM dbo.Paper_Ref_L4Fields_int AS A   -- HASH(PaperId)
JOIN dbo.Paper_Ref_L4Fields_int AS B   -- HASH(PaperId)
  ON A.PaperId = B.PaperId
    AND A.L4FieldOfStudyId < B.L4FieldOfStudyId;
GO
CREATE STATISTICS st_pairs_pid ON dbo.PaperRefFieldPairs_int(PaperId);
CREATE STATISTICS st_pairs_f1  ON dbo.PaperRefFieldPairs_int(Field1);
CREATE STATISTICS st_pairs_f2  ON dbo.PaperRefFieldPairs_int(Field2);
GO

IF OBJECT_ID(N'dbo.PaperRefFieldPairs', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefFieldPairs;
GO
CREATE VIEW dbo.PaperRefFieldPairs AS
SELECT PaperId, [Year], Field1, Field2
FROM dbo.PaperRefFieldPairs_int;
GO


/* RefFieldPairsFreqPerYear */
IF OBJECT_ID(N'dbo.RefFieldPairsFreqPerYear', N'V') IS NOT NULL
    DROP VIEW dbo.RefFieldPairsFreqPerYear;
GO
CREATE VIEW dbo.RefFieldPairsFreqPerYear AS
SELECT
    Field1,
    Field2,
    [Year],
    COUNT(*) AS FreqPerYear
FROM   dbo.PaperRefFieldPairs
GROUP  BY Field1, Field2, [Year];
GO

/* RefFieldPairsCumFreq */
IF OBJECT_ID(N'dbo.RefFieldPairsCumFreq', N'V') IS NOT NULL
    DROP VIEW dbo.RefFieldPairsCumFreq;
GO
CREATE VIEW dbo.RefFieldPairsCumFreq AS
SELECT DISTINCT
       Field1,
       Field2,
       [Year],
       SUM(FreqPerYear) OVER (
           PARTITION BY Field1, Field2
           ORDER BY      [Year] ASC
           ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
       ) AS CumFreq
FROM   dbo.RefFieldPairsFreqPerYear;
GO

/* RefL4FieldPairsStats */
IF OBJECT_ID(N'dbo.RefL4FieldPairsStats', N'V') IS NOT NULL
    DROP VIEW dbo.RefL4FieldPairsStats;
GO
CREATE VIEW dbo.RefL4FieldPairsStats AS
SELECT
    F.Field1,
    F.Field2,
    F.[Year],
    F.FreqPerYear,
    (C.CumFreq - F.FreqPerYear) AS LaggedCumFreq
FROM   dbo.RefFieldPairsFreqPerYear AS F
JOIN   dbo.RefFieldPairsCumFreq     AS C
       ON  F.Field1 = C.Field1
       AND F.Field2 = C.Field2
       AND F.[Year] = C.[Year];
GO

/* L1-a  PaperRefL4FieldPairsExpandedL1  ────────*/
IF OBJECT_ID(N'dbo.PaperRefL4FieldPairsExpandedL1', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefL4FieldPairsExpandedL1;
GO
CREATE VIEW dbo.PaperRefL4FieldPairsExpandedL1 AS
SELECT
    P.PaperId,
    P.[Year],
    P.Field1,
    P.Field2,
    S.FreqPerYear   AS FreqPerYearL1,
    S.LaggedCumFreq AS LaggedCumFreqL1
FROM   dbo.PaperRefFieldPairs   AS P
LEFT   JOIN dbo.RefL4FieldPairsStats AS S
       ON  S.Field1 = P.Field1
       AND S.Field2 = P.Field2
       AND S.[Year] = P.[Year] - 1;          -- lag-1
GO

/* L1-b  …NN (null-to-zero)  ────────────────*/
IF OBJECT_ID(N'dbo.PaperRefL4FieldPairsExpandedL1NN', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefL4FieldPairsExpandedL1NN;
GO
CREATE VIEW dbo.PaperRefL4FieldPairsExpandedL1NN AS
SELECT PaperId,
       [Year],
       Field1,
       Field2,
       COALESCE(FreqPerYearL1 ,0) AS FreqPerYearL1NN,
       COALESCE(LaggedCumFreqL1,0) AS LaggedCumFreqL1NN
FROM   dbo.PaperRefL4FieldPairsExpandedL1;
GO

/* L1-c  Stats per paper  ───────────────────*/
IF OBJECT_ID(N'dbo.PaperRefFieldPairsStatsL1', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefFieldPairsStatsL1;
GO
CREATE VIEW dbo.PaperRefFieldPairsStatsL1 AS
SELECT DISTINCT
       PaperId,
       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYearL1NN)
           OVER (PARTITION BY PaperId) AS MedianRefPairFreqL1,
       PERCENTILE_DISC(0.1)  WITHIN GROUP (ORDER BY FreqPerYearL1NN)
           OVER (PARTITION BY PaperId) AS Top10PRefPairFreqL1,
       PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYearL1NN)
           OVER (PARTITION BY PaperId) AS Top5PRefPairFreqL1,
       PERCENTILE_DISC(0.9)  WITHIN GROUP (ORDER BY FreqPerYearL1NN)
           OVER (PARTITION BY PaperId) AS Top90PRefPairFreqL1,
       PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYearL1NN)
           OVER (PARTITION BY PaperId) AS Top95PRefPairFreqL1,

       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreqL1NN)
           OVER (PARTITION BY PaperId) AS MedianRefPairCumFreqL1,
       PERCENTILE_DISC(0.1)  WITHIN GROUP (ORDER BY LaggedCumFreqL1NN)
           OVER (PARTITION BY PaperId) AS Top10PRefPairCumFreqL1,
       PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreqL1NN)
           OVER (PARTITION BY PaperId) AS Top5PRefPairCumFreqL1,
       PERCENTILE_DISC(0.9)  WITHIN GROUP (ORDER BY LaggedCumFreqL1NN)
           OVER (PARTITION BY PaperId) AS Top90PRefPairCumFreqL1,
       PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreqL1NN)
           OVER (PARTITION BY PaperId) AS Top95PRefPairCumFreqL1
FROM   dbo.PaperRefL4FieldPairsExpandedL1NN;
GO


/* L2-a */
IF OBJECT_ID(N'dbo.PaperRefL4FieldPairsExpandedL2', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefL4FieldPairsExpandedL2;
GO
CREATE VIEW dbo.PaperRefL4FieldPairsExpandedL2 AS
SELECT  P.PaperId, P.[Year], P.Field1, P.Field2,
        S.FreqPerYear   AS FreqPerYearL2,
        S.LaggedCumFreq AS LaggedCumFreqL2
FROM    dbo.PaperRefFieldPairs P
LEFT    JOIN dbo.RefL4FieldPairsStats S
          ON S.Field1=P.Field1 AND S.Field2=P.Field2
         AND S.[Year]=P.[Year]-2;
GO

/* L2-b */
IF OBJECT_ID(N'dbo.PaperRefL4FieldPairsExpandedL2NN', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefL4FieldPairsExpandedL2NN;
GO
CREATE VIEW dbo.PaperRefL4FieldPairsExpandedL2NN AS
SELECT PaperId,[Year],Field1,Field2,
       COALESCE(FreqPerYearL2 ,0) AS FreqPerYearL2NN,
       COALESCE(LaggedCumFreqL2,0) AS LaggedCumFreqL2NN
FROM   dbo.PaperRefL4FieldPairsExpandedL2;
GO

/* L2-c */
IF OBJECT_ID(N'dbo.PaperRefFieldPairsStatsL2', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefFieldPairsStatsL2;
GO
CREATE VIEW dbo.PaperRefFieldPairsStatsL2 AS
SELECT DISTINCT PaperId,
       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYearL2NN)
           OVER(PARTITION BY PaperId) AS MedianRefPairFreqL2,
       PERCENTILE_DISC(0.1)  WITHIN GROUP (ORDER BY FreqPerYearL2NN)
           OVER(PARTITION BY PaperId) AS Top10PRefPairFreqL2,
       PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYearL2NN)
           OVER(PARTITION BY PaperId) AS Top5PRefPairFreqL2,
       PERCENTILE_DISC(0.9)  WITHIN GROUP (ORDER BY FreqPerYearL2NN)
           OVER(PARTITION BY PaperId) AS Top90PRefPairFreqL2,
       PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYearL2NN)
           OVER(PARTITION BY PaperId) AS Top95PRefPairFreqL2,

       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreqL2NN)
           OVER(PARTITION BY PaperId) AS MedianRefPairCumFreqL2,
       PERCENTILE_DISC(0.1)  WITHIN GROUP (ORDER BY LaggedCumFreqL2NN)
           OVER(PARTITION BY PaperId) AS Top10PRefPairCumFreqL2,
       PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreqL2NN)
           OVER(PARTITION BY PaperId) AS Top5PRefPairCumFreqL2,
       PERCENTILE_DISC(0.9)  WITHIN GROUP (ORDER BY LaggedCumFreqL2NN)
           OVER(PARTITION BY PaperId) AS Top90PRefPairCumFreqL2,
       PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreqL2NN)
           OVER(PARTITION BY PaperId) AS Top95PRefPairCumFreqL2
FROM   dbo.PaperRefL4FieldPairsExpandedL2NN;
GO


/* L3-a */
IF OBJECT_ID(N'dbo.PaperRefL4FieldPairsExpandedL3', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefL4FieldPairsExpandedL3;
GO
CREATE VIEW dbo.PaperRefL4FieldPairsExpandedL3 AS
SELECT  P.PaperId,P.[Year],P.Field1,P.Field2,
        S.FreqPerYear   AS FreqPerYearL3,
        S.LaggedCumFreq AS LaggedCumFreqL3
FROM    dbo.PaperRefFieldPairs P
LEFT    JOIN dbo.RefL4FieldPairsStats S
          ON S.Field1=P.Field1 AND S.Field2=P.Field2
         AND S.[Year]=P.[Year]-3;
GO

/* L3-b */
IF OBJECT_ID(N'dbo.PaperRefL4FieldPairsExpandedL3NN', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefL4FieldPairsExpandedL3NN;
GO
CREATE VIEW dbo.PaperRefL4FieldPairsExpandedL3NN AS
SELECT PaperId,[Year],Field1,Field2,
       COALESCE(FreqPerYearL3 ,0) AS FreqPerYearL3NN,
       COALESCE(LaggedCumFreqL3,0) AS LaggedCumFreqL3NN
FROM   dbo.PaperRefL4FieldPairsExpandedL3;
GO

/* L3-c */
IF OBJECT_ID(N'dbo.PaperRefFieldPairsStatsL3', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefFieldPairsStatsL3;
GO
CREATE VIEW dbo.PaperRefFieldPairsStatsL3 AS
SELECT DISTINCT PaperId,
       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYearL3NN)
           OVER(PARTITION BY PaperId) AS MedianRefPairFreqL3,
       PERCENTILE_DISC(0.1)  WITHIN GROUP (ORDER BY FreqPerYearL3NN)
           OVER(PARTITION BY PaperId) AS Top10PRefPairFreqL3,
       PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYearL3NN)
           OVER(PARTITION BY PaperId) AS Top5PRefPairFreqL3,
       PERCENTILE_DISC(0.9)  WITHIN GROUP (ORDER BY FreqPerYearL3NN)
           OVER(PARTITION BY PaperId) AS Top90PRefPairFreqL3,
       PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYearL3NN)
           OVER(PARTITION BY PaperId) AS Top95PRefPairFreqL3,

       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreqL3NN)
           OVER(PARTITION BY PaperId) AS MedianRefPairCumFreqL3,
       PERCENTILE_DISC(0.1)  WITHIN GROUP (ORDER BY LaggedCumFreqL3NN)
           OVER(PARTITION BY PaperId) AS Top10PRefPairCumFreqL3,
       PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreqL3NN)
           OVER(PARTITION BY PaperId) AS Top5PRefPairCumFreqL3,
       PERCENTILE_DISC(0.9)  WITHIN GROUP (ORDER BY LaggedCumFreqL3NN)
           OVER(PARTITION BY PaperId) AS Top90PRefPairCumFreqL3,
       PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreqL3NN)
           OVER(PARTITION BY PaperId) AS Top95PRefPairCumFreqL3
FROM   dbo.PaperRefL4FieldPairsExpandedL3NN;
GO

/* L4-a */
IF OBJECT_ID(N'dbo.PaperRefL4FieldPairsExpandedL4', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefL4FieldPairsExpandedL4;
GO
CREATE VIEW dbo.PaperRefL4FieldPairsExpandedL4 AS
SELECT  P.PaperId,P.[Year],P.Field1,P.Field2,
        S.FreqPerYear   AS FreqPerYearL4,
        S.LaggedCumFreq AS LaggedCumFreqL4
FROM    dbo.PaperRefFieldPairs P
LEFT    JOIN dbo.RefL4FieldPairsStats S
          ON S.Field1=P.Field1 AND S.Field2=P.Field2
         AND S.[Year]=P.[Year]-4;
GO

/* L4-b */
IF OBJECT_ID(N'dbo.PaperRefL4FieldPairsExpandedL4NN', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefL4FieldPairsExpandedL4NN;
GO
CREATE VIEW dbo.PaperRefL4FieldPairsExpandedL4NN AS
SELECT PaperId,[Year],Field1,Field2,
       COALESCE(FreqPerYearL4 ,0) AS FreqPerYearL4NN,
       COALESCE(LaggedCumFreqL4,0) AS LaggedCumFreqL4NN
FROM   dbo.PaperRefL4FieldPairsExpandedL4;
GO


/* L4-c  ─── PaperRefFieldPairsStatsL4 */
IF OBJECT_ID(N'dbo.PaperRefFieldPairsStatsL4', N'V') IS NOT NULL
    DROP VIEW dbo.PaperRefFieldPairsStatsL4;
GO
CREATE VIEW dbo.PaperRefFieldPairsStatsL4 AS
SELECT DISTINCT
       PaperId,

       /* —— frequency percentiles —— */
       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY FreqPerYearL4NN)
           OVER (PARTITION BY PaperId) AS MedianRefPairFreqL4,
       PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY FreqPerYearL4NN)
           OVER (PARTITION BY PaperId) AS Top10PRefPairFreqL4,
       PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY FreqPerYearL4NN)
           OVER (PARTITION BY PaperId) AS Top5PRefPairFreqL4,
       PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY FreqPerYearL4NN)
           OVER (PARTITION BY PaperId) AS Top90PRefPairFreqL4,
       PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY FreqPerYearL4NN)
           OVER (PARTITION BY PaperId) AS Top95PRefPairFreqL4,

       /* —— cumulative-frequency percentiles —— */
       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY LaggedCumFreqL4NN)
           OVER (PARTITION BY PaperId) AS MedianRefPairCumFreqL4,
       PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY LaggedCumFreqL4NN)
           OVER (PARTITION BY PaperId) AS Top10PRefPairCumFreqL4,
       PERCENTILE_DISC(0.05) WITHIN GROUP (ORDER BY LaggedCumFreqL4NN)
           OVER (PARTITION BY PaperId) AS Top5PRefPairCumFreqL4,
       PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY LaggedCumFreqL4NN)
           OVER (PARTITION BY PaperId) AS Top90PRefPairCumFreqL4,
       PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY LaggedCumFreqL4NN)
           OVER (PARTITION BY PaperId) AS Top95PRefPairCumFreqL4
FROM   dbo.PaperRefL4FieldPairsExpandedL4NN;
GO

/* ------------- PaperDataMergedView ------------------- */
IF OBJECT_ID(N'dbo.PaperDataMergedView', N'V') IS NOT NULL
    DROP VIEW dbo.PaperDataMergedView;
GO
CREATE VIEW dbo.PaperDataMergedView AS
SELECT
    /* ---------- base ---------- */
    F.*,                                 -- every column from PaperDataFlagged

    /* ---------- author info ---------- */
    AC.AuthorCount                    AS AuthorCount,
    FA.AuthorId                       AS FirstAuthorId,
    LA.AuthorId                       AS LastAuthorId,

    /* ---------- top-field ---------- */
    TF.OGFieldOfStudyId               AS TopFieldId,

    /* ---------- level-1 stats ---------- */
    S1.MedianRefPairFreqL1,
    S1.Top10PRefPairFreqL1,
    S1.Top5PRefPairFreqL1,
    S1.Top90PRefPairFreqL1,
    S1.Top95PRefPairFreqL1,
    S1.MedianRefPairCumFreqL1,
    S1.Top10PRefPairCumFreqL1,
    S1.Top5PRefPairCumFreqL1,
    S1.Top90PRefPairCumFreqL1,
    S1.Top95PRefPairCumFreqL1,

    /* ---------- level-2 stats ---------- */
    S2.MedianRefPairFreqL2,
    S2.Top10PRefPairFreqL2,
    S2.Top5PRefPairFreqL2,
    S2.Top90PRefPairFreqL2,
    S2.Top95PRefPairFreqL2,
    S2.MedianRefPairCumFreqL2,
    S2.Top10PRefPairCumFreqL2,
    S2.Top5PRefPairCumFreqL2,
    S2.Top90PRefPairCumFreqL2,
    S2.Top95PRefPairCumFreqL2,

    /* ---------- level-3 stats ---------- */
    S3.MedianRefPairFreqL3,
    S3.Top10PRefPairFreqL3,
    S3.Top5PRefPairFreqL3,
    S3.Top90PRefPairFreqL3,
    S3.Top95PRefPairFreqL3,
    S3.MedianRefPairCumFreqL3,
    S3.Top10PRefPairCumFreqL3,
    S3.Top5PRefPairCumFreqL3,
    S3.Top90PRefPairCumFreqL3,
    S3.Top95PRefPairCumFreqL3,

    /* ---------- level-4 stats ---------- */
    S4.MedianRefPairFreqL4,
    S4.Top10PRefPairFreqL4,
    S4.Top5PRefPairFreqL4,
    S4.Top90PRefPairFreqL4,
    S4.Top95PRefPairFreqL4,
    S4.MedianRefPairCumFreqL4,
    S4.Top10PRefPairCumFreqL4,
    S4.Top5PRefPairCumFreqL4,
    S4.Top90PRefPairCumFreqL4,
    S4.Top95PRefPairCumFreqL4

FROM   dbo.PaperDataFlagged               AS F
LEFT   JOIN dbo.PaperAuthorCount          AS AC ON F.PaperId = AC.PaperId
LEFT   JOIN dbo.PaperFirstAuthor          AS FA ON F.PaperId = FA.PaperId
LEFT   JOIN dbo.PaperLastAuthor           AS LA ON F.PaperId = LA.PaperId
LEFT   JOIN dbo.PaperTopField             AS TF ON F.PaperId = TF.PaperId
LEFT   JOIN dbo.PaperRefFieldPairsStatsL1 AS S1 ON F.PaperId = S1.PaperId
LEFT   JOIN dbo.PaperRefFieldPairsStatsL2 AS S2 ON F.PaperId = S2.PaperId
LEFT   JOIN dbo.PaperRefFieldPairsStatsL3 AS S3 ON F.PaperId = S3.PaperId
LEFT   JOIN dbo.PaperRefFieldPairsStatsL4 AS S4 ON F.PaperId = S4.PaperId;
GO

-- Build an internal, distributed table from your view
IF OBJECT_ID(N'dbo.PaperDataMerged_int','U') IS NOT NULL
    DROP TABLE dbo.PaperDataMerged_int;
GO
CREATE TABLE dbo.PaperDataMerged_int
WITH (
    DISTRIBUTION = HASH(PaperId),          -- join/key column
    CLUSTERED COLUMNSTORE INDEX            -- compress + less spill
)
AS
SELECT *
FROM dbo.PaperDataMergedView
OPTION (LABEL = 'Build_PaperDataMerged_int');   -- optional label
GO


-- Export to Parquet (CETAS) from the internal table
IF EXISTS (SELECT 1 FROM sys.external_tables
           WHERE name = N'PaperDataMerged' AND schema_id = SCHEMA_ID(N'dbo'))
    EXEC (N'DROP EXTERNAL TABLE dbo.PaperDataMerged;');
GO

CREATE EXTERNAL TABLE dbo.PaperDataMerged
WITH (
    LOCATION    = '/Output/run1/PaperDataMerged/',
    DATA_SOURCE = MyBlob,
    FILE_FORMAT = ParquetFmt
)
AS
SELECT *
FROM dbo.PaperDataMerged_int
OPTION (LABEL = 'Export_PaperDataMerged');
GO


